# Description ------------------------------------------------------------------
#   Analysis of zip codes common to both utilities.
#   Also: Analysis of zip codes neighboring the zip codes that are common
#     to both utilities.
#
#   We want to consider three dimensions:
#     1. Type of price: 5 types
#     2. Number of lags: 1 lead to 3 lags
#     3. Spatiotemporal fixed effects

# Setup ------------------------------------------------------------------------
# Options
options(stringsAsFactors = FALSE)
# Packages
library(pacman)
p_load(HiClimR, data.table, stringr, lubridate, lfe, fixest, magrittr, here)
# Set the number of threads for the lfe package
# options(lfe.threads = 32)
# Directory of joined bill-weather files
# dir_project = "S:/Edward/Elasticity/"
dir_project = "T:/Home/Edward/Elasticity"
# dir_project = "/Users/edwardarubin/Dropbox/Research/MyProjects/NaturalGas/"
dir_csv     = file.path(dir_project, "DataCsv")
dir_rds     = file.path(dir_project, "DataR")
dir_figures = file.path(dir_project, "Figures")
dir_output  = file.path(dir_project, "Output")
dir_results = file.path(dir_rds, "Results/Results20171025/")
# dir_results = file.path(dir_rds, "Results/Results20171025/Group1/")
# dir_results = file.path(dir_rds, "Results/Results20171025/Group2/")
# dir_results = file.path(dir_rds, "Results/Results20171025/Group3/")

# My felm function -------------------------------------------------------------
full_summary <- function(object, name) {
  # HACK: Try the variout lags that might show up. Not ideal.
  if ("stage1" %in% names(object)) {
    # Calculate F stat in first stage
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lead3 | `spot_price_1week_lead3:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lead2 | `spot_price_1week_lead2:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lead1 | `spot_price_1week_lead1:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week | `spot_price_1week:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lag1 | `spot_price_1week_lag1:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lag2 | `spot_price_1week_lag2:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lag3 | `spot_price_1week_lag3:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lag4 | `spot_price_1week_lag4:utilitysocal`,
                           type = "cluster")[["F"]], silent = T)
    try(f_stat <- waldtest(object$stage1,
                           ~ spot_price_1week_lag2 | spot_price_1week_lag2_socal,
                           type = "cluster")[["F"]], silent = T)
    # Create the first-stage list
    list1 <- list(
      coef = object$stage1 %>% summary() %>% coef(),
      fe   = object$stage1$fe %>% names(),
      N    = object$stage1$N,
      k    = object$stage1$p,
      call = object$call,
      f    = f_stat,
      lhs  = object$stage1$lhs)
  } else {
    list1 <- NULL
  }
  # Create the second-stage list
  list2 <- list(
    coef = object %>% summary() %>% coef(),
    fe   = object$fe %>% names(),
    N    = object$N,
    k    = object$p,
    call = object$call,
    lhs  = object$lhs)
  # Save
  saveRDS(list(list1, list2),
          paste0(dir_results, name, ".rds"))
}

# Function to run regressions (especially 2SLS) --------------------------------
reg_run <- function(var_lhs, var_exog, var_endo,
                    fe_hh, fe_space, fe_time, var_iv, var_cl, lag, care_sub, season_sub) {
  # Function that replaces underscored names with camel case
  # Also: remove math
  to_camel <- . %>% str_replace_all("_", " ") %>%
    str_to_title() %>%
    str_replace_all(" ", "") %>%
    str_replace("[I][(]", "") %>%
    str_replace_all("[()/]", "")
  # Create the lag suffix
  if (lag > 0) l <- paste0("_lag", lag)
  if (lag == 0) l <- ""
  if (lag < 0) l <- paste0("_lead", abs(lag))
  # Add lags to endogenous variables IVs
  if (!is.null(var_endo)) {
    var_endo_l <- ifelse(lag == 0,
                         yes = var_endo,
                         no = ifelse(str_sub(var_endo, -1) != ")",
                                     yes = paste0(var_endo, l),
                                     no = paste0(str_sub(var_endo, 1, -2), l, ")")))
    var_iv_l <- ifelse(lag == 0,
                       yes = var_iv,
                       no = ifelse(str_sub(var_iv, -1) != ")",
                                   yes = paste0(var_iv, l),
                                   no = paste0(str_sub(var_iv, 1, -2), l, ")")))
  }
  # Create the regression formula
  eqn <- paste0(
    var_lhs, " ~ ",
    # "Exogenous" variables
    paste(var_exog, collapse = " + "), " | ",
    # Fixed effects
    fe_hh, " + ",
    ifelse(fe_space == "", fe_time, paste0(fe_space, "_", fe_time)), " | ",
    # Instruments
    ifelse(is.null(var_endo),
           " 0 ",
           paste0("( ", var_endo_l, " ~ ",
                  var_iv_l, " + ",
                  var_iv_l, ":utility)")),
    " | ",
    # Clustering
    paste(var_cl, collapse = " + ")
  ) %>% as.formula()
  # CARE values for subsetting
  if (tolower(care_sub) == "all") {
    care_val <- sub_dt[,care] %>% unique()
  } else {
    care_val <- care_sub
  }
  # Season values for subsetting
  if (tolower(season_sub) == "all") {
    season_val <- sub_dt[,season] %>% unique()
  } else {
    season_val <- season_sub
  }
  # Run the regression
  est <- felm(
    eqn,
    data = sub_dt[care %in% care_val & season %in% season_val],
    psdef = F, exactDOF = T)
  # Create filename
  name <- paste(
    ifelse(is.null(var_endo), "OLS", to_camel(var_endo)),
    "_",
    ifelse(lag >= 0, paste0("Lag", lag), paste0("Lead", abs(lag))),
    "_",
    paste0(to_camel(fe_space), to_camel(fe_time)),
    to_camel(fe_hh),
    "_",
    ifelse(tolower(care_sub) == "all", "All",
           ifelse(care_sub == T, "Care", "Noncare")),
    "_",
    to_camel(season_sub),
    "_",
    sapply(X = var_exog, FUN = to_camel) %>% paste(collapse = "_"),
    sep = "_") %>% str_replace_all("__", "_")
  # Save the results
  full_summary(object = est, name = name)
}

# Function to run OLS regressions ----------------------------------------------
ols_run <- function(var_lhs, price_exog, other_exog,
                    fe_hh, fe_space, fe_time, var_cl, lag, care_sub, season_sub) {
  # Function that replaces underscored names with camel case
  # Also: remove math
  to_camel <- . %>% str_replace_all("_", " ") %>%
    str_to_title() %>%
    str_replace_all(" ", "") %>%
    str_replace("[I][(]", "") %>%
    str_replace_all("[()/]", "")
  # Create the lag suffix
  l <- ifelse(lag > 0, paste0("_lag", lag), "")
  # Add lags to endogenous variables IVs
  if (!is.null(price_exog)) {
    price_exog_l <- ifelse(lag <= 0,
                           yes = price_exog,
                           no = ifelse(str_sub(price_exog, -1) != ")",
                                       yes = paste0(price_exog, l),
                                       no = paste0(str_sub(price_exog, 1, -2), l, ")")))
  }
  # Create the regression formula
  eqn <- paste0(
    var_lhs, " ~ ",
    # (Assumed) Exogenous price with appropriate lag
    ifelse(!is.null(price_exog), paste0(price_exog_l, " + "), ""),
    # Other "exogenous" variables
    paste(other_exog, collapse = " + "), " | ",
    # Fixed effects
    fe_hh, " + ",
    ifelse(fe_space == "", fe_time, paste0(fe_space, "_", fe_time)), " | ",
    # No instruments
    " 0 | ",
    # Clustering
    paste(var_cl, collapse = " + ")
  ) %>% as.formula()
  # CARE values for subsetting
  if (tolower(care_sub) == "all") {
    care_val <- sub_dt[,care] %>% unique()
  } else {
    care_val <- care_sub
  }
  # Season values for subsetting
  if (tolower(season_sub) == "all") {
    season_val <- sub_dt[,season] %>% unique()
  } else {
    season_val <- season_sub
  }
  # Run the regression
  est <- felm(
    eqn,
    data = sub_dt[care %in% care_val & season %in% season_val],
    psdef = F, exactDOF = T)
  # Create filename
  name <- paste(
    "OLS",
    "_",
    to_camel(var_lhs),
    "_",
    sapply(X = c(price_exog, other_exog), FUN = to_camel) %>%
      paste(collapse = "_"),
    "_",
    paste0("Lag", lag),
    "_",
    paste0(to_camel(fe_space), to_camel(fe_time)),
    to_camel(fe_hh),
    "_",
    ifelse(tolower(care_sub) == "all", "All",
           ifelse(care_sub == T, "Care", "Noncare")),
    "_",
    to_camel(season_sub),
    sep = "_") %>% str_replace_all("__", "_")
  # Save the results
  full_summary(object = est, name = name)
}

# Load the common zip code panel -----------------------------------------------
# Load the common zip code data
bill_dt = fst::read_fst(
  # file.path(dir_rds, "Prices/Joint/commonCitiesZipsBills.fst"),
  file.path(dir_rds, "Prices/Joint/zipsNeighbor0.fst"),
  as.data.table = TRUE
)
# bill_dt <- readRDS(file.path(dir_rds,
#   # "Prices/Joint/commonCitiesZipsBills01PercentSample.rds"))
#   # "Prices/Joint/commonCitiesZipsBills10PercentSample.rds"))
#   "Prices/Joint/commonCitiesZipsBills.rds"))
# Load the neighbor panel (starts at 0 for 'common zips')
# bill_dt <- lapply(
#   X = 0:0,
#   # X = 0:1,
#   # X = 0:2,
#   # X = 0:3,
#   FUN = function(g) {
#     file.path(
#       dir_rds,
#       "Prices/Joint",
#       paste0(
#         "zipsNeighbor",
#         g,
#         # "_01PercentSample.rds") %>% readRDS()
#         # "_10PercentSample.rds") %>% readRDS()
#         ".rds"
#       )
#     ) %>% readRDS()
#   }) %>% rbindlist()
# Create lags and leads of flag_thm and flag_rev
setorder(bill_dt, hh_id, begin_date)
bill_dt[, `:=`(
  flag_rev_lead3 = shift(flag_rev,  3L, fill = NA, type = "lead"),
  flag_thm_lead3 = shift(flag_thm,  3L, fill = NA, type = "lead"),
  flag_rev_lead2 = shift(flag_rev,  2L, fill = NA, type = "lead"),
  flag_thm_lead2 = shift(flag_thm,  2L, fill = NA, type = "lead"),
  flag_rev_lead1 = shift(flag_rev,  1L, fill = NA, type = "lead"),
  flag_thm_lead1 = shift(flag_thm,  1L, fill = NA, type = "lead"),
  flag_rev_lag1  = shift(flag_rev,  1L, fill = NA, type = "lag"),
  flag_thm_lag1  = shift(flag_thm,  1L, fill = NA, type = "lag"),
  flag_rev_lag2  = shift(flag_rev,  2L, fill = NA, type = "lag"),
  flag_thm_lag2  = shift(flag_thm,  2L, fill = NA, type = "lag"),
  flag_rev_lag3  = shift(flag_rev,  3L, fill = NA, type = "lag"),
  flag_thm_lag3  = shift(flag_thm,  3L, fill = NA, type = "lag"),
  flag_rev_lag4  = shift(flag_rev,  3L, fill = NA, type = "lag"),
  flag_thm_lag4  = shift(flag_thm,  3L, fill = NA, type = "lag")
), by = hh_id]
# Keep bills with 28-34 days
bill_dt <- bill_dt[between(days, 28, 34)]
bill_dt <- bill_dt[between(days_lag1, 28, 34)]
bill_dt <- bill_dt[between(days_lag2, 28, 34)]
# bill_dt <- bill_dt[between(days_lag3, 28, 34)]
gc()
# Drop observations flagged due to problems re-calculating bills
# NOTE: I flagged bills with absolute percent error of 5% or more.
bill_dt <- bill_dt[
  thm > 0 & flag_thm == 0 & bill_flag == F & flag_rev == 0 &
    thm_lag1 > 0 & flag_thm_lag1 == 0 & bill_flag_lag1 == F & flag_rev_lag1 == 0 &
    thm_lag2 > 0 & flag_thm_lag2 == 0 & bill_flag_lag2 == F & flag_rev_lag2 == 0 &
    # thm_lag3 > 0 & flag_thm_lag3 == 0 & bill_flag_lag3 == F & flag_rev_lag3 == 0 &
    T]
# Rename
sub_dt <- bill_dt
rm(bill_dt); gc()

# # Enforce more conservative discontinuity at zip code --------------------------
#   # Count observations by zip code
#   zip_dt <- bill_dt[, .(n = .N), by = .(zip, utility)]
#   zip_dt %<>% dcast(zip ~ utility)
#   common_dt <- zip_dt[!(is.na(pge) | is.na(socal))]
#   # Drop observations from zip codes not covered by both utilities
#   sub_dt <- bill_dt[zip %in% common_dt$zip]
#   # Drop the full dataset (will not use for estimation)
#   rm(bill_dt); gc()
#   # Count observations by 9-digit zip code and utility
#   zip9_dt <- sub_dt[, .(n = .N), by = .(zip, zip_plus4, utility)]
#   zip9_dt <- zip9_dt[!is.na(zip_plus4)]
#   zip9_dt[, zip9 := paste0(zip, str_pad(zip_plus4, 4, "left", 0))]
#   zip9_dt[, `:=`(zip = NULL, zip_plus4 = NULL)]
#   zip9_dt %<>% dcast(zip9 ~ utility, value.var = "n")
#   zip9_dt[is.na(pge), pge := 0]
#   zip9_dt[is.na(socal), socal := 0]
#   setcolorder(zip9_dt, c(2,3,1))
#   setnames(zip9_dt,
#     old = c("pge", "socal"),
#     new = paste0("n_", c("pge", "socal")))
#   zip9_dt[, `:=`(
#     zip = str_sub(zip9, 1, 5),
#     zip_plus4 = str_sub(zip9, 6, 9))]
#   # Save
#   saveRDS(object = zip9_dt, file = paste0(dir_results, "zip9CommonCounts.rds"))

# Add exceeded summaries -------------------------------------------------------
# Calculate whether a bill exceeds the allowance
sub_dt[, exceeded := thm > allowance]
# Percentage of times a bill exceeds the allowance within a HH
sub_dt[, pct_exceeded := mean(exceeded), by = hh_id]
# Summaries by HH
ex_dt <- sub_dt[, .(hh_id, pct_exceeded)] %>% unique()
# Save ex_dt
saveRDS(object = ex_dt, file = paste0(dir_results, "exceededStats.rds"))

# OLS Regressions: Marginal price, ym FEs --------------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_mrg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Average price, ym FEs ---------------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_avg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Avg. Mrg. price, ym FEs -------------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_avg_mrg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Baseline price, ym FEs --------------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_base)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Simulated marginal price, ym FEs ----------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(mrg_sim_iv_10_14)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Marginal price, city-ym FEs ---------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_mrg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Average price, city-ym FEs ----------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_avg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Avg. Mrg. price, city-ym FEs --------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_avg_mrg)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Baseline price, city-ym FEs ---------------------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(price_base)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# OLS Regressions: Simulated marginal price, city-ym FEs -----------------------
lapply(X = 0:3, FUN = function(x) {
  ols_run(
    var_lhs    = "log(thm_day)",
    price_exog = "log(mrg_sim_iv_10_14)",
    other_exog = "I(bill_hdd/days)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Marginal price, city-ym FEs ----------------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_mrg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Marginal price, city-ym FEs, no HDDS -------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "1",
    var_endo   = "log(price_mrg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Average price, city-ym FEs -----------------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_avg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Avg. mrg. price, city-ym FEs ---------------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_avg_mrg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Baseline price, city-ym FEs ----------------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_base)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})

# IV Regressions: Simulated marginal price, city-ym FEs ------------------------
lapply(X = 0:3, FUN = function(x) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(mrg_sim_iv_10_14)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = x,
    care_sub   = "all",
    season_sub = "all")
})


# IV Regressions: All prices, city-ym FEs, HDDS, 1st lead ----------------------
# Copy sub_dt to tmp_dt
tmp_dt <- copy(sub_dt)
# First: clean data for leads (many NAs in price_vag_lead1)
sub_dt <- sub_dt[thm_lead1 > 0 &
                   flag_thm_lead1 == 0 & bill_flag_lead1 == F & flag_rev_lead1 == 0]
# Run the 1-lead regression
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = -1,
      care_sub   = "all",
      season_sub = "all")
  })
# Exchange sub_dt for tmp_dt
rm(sub_dt)
sub_dt <- tmp_dt
rm(tmp_dt)
gc()

# IV Regressions: All prices, city-ym FEs, HDDS, 2nd lead ----------------------
# Copy sub_dt to tmp_dt
tmp_dt <- copy(sub_dt)
# First: clean data for leads (many NAs in price_vag_lead2)
sub_dt <- sub_dt[thm_lead2 > 0 &
                   flag_thm_lead2 == 0 & bill_flag_lead2 == F & flag_rev_lead2 == 0]
# Run the 2-lead regression
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = -2,
      care_sub   = "all",
      season_sub = "all")
  })
# Exchange sub_dt for tmp_dt
rm(sub_dt)
sub_dt <- tmp_dt
rm(tmp_dt)
gc()

# IV Regressions: All prices, city-ym FEs, HDDS, 3rd lead ----------------------
# Copy sub_dt to tmp_dt
tmp_dt <- copy(sub_dt)
# First: clean data for leads (many NAs in price_vag_lead3)
sub_dt <- sub_dt[thm_lead3 > 0 &
                   flag_thm_lead3 == 0 & bill_flag_lead3 == F & flag_rev_lead3 == 0]
# Run the 1-lead regression
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = -3,
      care_sub   = "all",
      season_sub = "all")
  })
# Exchange sub_dt for tmp_dt
rm(sub_dt)
sub_dt <- tmp_dt
rm(tmp_dt)
gc()

# IV Regressions: All prices, city-ym FEs, HDDS, 4th lag -----------------------
# Copy sub_dt to tmp_dt
tmp_dt <- copy(sub_dt)
# First: clean data for leads (many NAs in price_vag_lead3)
sub_dt <- sub_dt[thm_lag4 > 0 &
                   flag_thm_lag4 == 0 & bill_flag_lag4 == F & flag_rev_lag4 == 0]
# Run the 1-lead regression
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = 4,
      care_sub   = "all",
      season_sub = "all")
  })
# Exchange sub_dt for tmp_dt
rm(sub_dt)
sub_dt <- tmp_dt
rm(tmp_dt)
gc()

# IV Regressions: All prices, city-ym FEs, no HDDS, 2nd lag --------------------
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "1",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = 2,
      care_sub   = "all",
      season_sub = "all")
  })

# IV Regressions: All prices, city-week FEs, 2nd lag ---------------------------
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "city",
      fe_time    = "yw",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = 2,
      care_sub   = "all",
      season_sub = "all")
  })

# IV Regressions: All prices, zip-ym FEs, 2nd lag ------------------------------
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "zip",
      fe_time    = "ym",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = 2,
      care_sub   = "all",
      season_sub = "all")
  })

# IV Regressions: All prices, zip-week FEs, 2nd lag ----------------------------
lapply(
  X = c("log(price_avg)", "log(price_avg_mrg)", "log(price_base)",
        "log(price_mrg)", "log(mrg_sim_iv_10_14)"),
  FUN = function(x) {
    reg_run(
      var_lhs    = "log(thm_day)",
      var_exog   = "I(bill_hdd/days)",
      var_endo   = x,
      fe_hh      = "hh_id",
      fe_space   = "zip",
      fe_time    = "yw",
      var_iv     = "spot_price_1week",
      var_cl     = c("hh_id", "price_cluster"),
      lag        = 2,
      care_sub   = "all",
      season_sub = "all")
  })

# IV Regressions: Het. marginal price, city-ym FEs, 2 lags ---------------------
mapply(FUN = function(x, y) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_mrg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = 2,
    care_sub   = x,
    season_sub = y)
},
x = list("all", "all", T, T, F, F, T, F),
y = list("summer", "winter", "summer", "winter", "summer", "winter", "all", "all"),
SIMPLIFY = F)

# IV Regressions: Het. average price, city-ym FEs, 2 lags ----------------------
mapply(FUN = function(x, y) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_avg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = 2,
    care_sub   = x,
    season_sub = y)
},
x = list("all", "all", T, T, F, F, T, F),
y = list("summer", "winter", "summer", "winter", "summer", "winter", "all", "all"),
SIMPLIFY = F)

# IV Regressions: Het. avg. mrg. price, city-ym FEs, 2 lags --------------------
mapply(FUN = function(x, y) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_avg_mrg)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = 2,
    care_sub   = x,
    season_sub = y)
},
x = list("all", "all", T, T, F, F, T, F),
y = list("summer", "winter", "summer", "winter", "summer", "winter", "all", "all"),
SIMPLIFY = F)

# IV Regressions: Het. baseline price, city-ym FEs, 2 lags ---------------------
mapply(FUN = function(x, y) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(price_base)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = 2,
    care_sub   = x,
    season_sub = y)
},
x = list("all", "all", T, T, F, F, T, F),
y = list("summer", "winter", "summer", "winter", "summer", "winter", "all", "all"),
SIMPLIFY = F)

# IV Regressions: Het. simulated marginal price, city-ym FEs, 2 lags -----------
mapply(FUN = function(x, y) {
  reg_run(
    var_lhs    = "log(thm_day)",
    var_exog   = "I(bill_hdd/days)",
    var_endo   = "log(mrg_sim_iv_10_14)",
    fe_hh      = "hh_id",
    fe_space   = "city",
    fe_time    = "ym",
    var_iv     = "spot_price_1week",
    var_cl     = c("hh_id", "price_cluster"),
    lag        = 2,
    care_sub   = x,
    season_sub = y)
},
x = list("all", "all", T, T, F, F, T, F),
y = list("summer", "winter", "summer", "winter", "summer", "winter", "all", "all"),
SIMPLIFY = F)

# Temporal subset regressions --------------------------------------------------
# Iterature over sets of months, estimating model for each set:
# The 2nd lag of marginal price with city-ym and some other FEs,
# subsetted by the start date's calendar month
# First: Build monthly sets
step_list <- c(
  1:12 %>% str_pad(2, "left", "0") %>% as.list(),
  list(c("01", "02")),
  list(c("03", "04")),
  list(c("05", "06")),
  list(c("07", "08")),
  list(c("09", "10")),
  list(c("11", "12")),
  list(c("01", "02", "03")),
  list(c("04", "05", "06")),
  list(c("07", "08", "09")),
  list(c("10", "11", "12")),
  list(c(1:3, 10:12) %>% str_pad(2, "left", "0")),
  list(c(4:9) %>% str_pad(2, "left", "0")),
  NULL)
# Run regressions
for (x in step_list) {
  felm(log(thm_day) ~
         I(bill_hdd/days) |
         # city_ym |
         # zip9 + city_ym |
         hh_id + city_ym |
         (log(price_mrg_lag2) ~
            spot_price_1week_lag2 +
            spot_price_1week_lag2:utility) |
         hh_id + price_cluster,
       data = sub_dt[month %in% x],
       # data = sub_dt[month %in% x & between(year, 2011, 2014)],
       psdef = F, exactDOF = T) %>%
    # full_summary(paste0("Monthly/LogPriceMrg__Lag2__CityYm__",
    # full_summary(paste0("Monthly/LogPriceMrg__Lag2__CityYm_Zip9__",
    full_summary(paste0("Monthly/LogPriceMrg__Lag2__CityYm_HhId__",
                        "All__All__BillHddDays__Month_", paste(x, collapse = "_")))
  # Print
  print(paste("Done", x))
}

# Temporal subset regressions with 'common' fixed effects ----------------------
# Drop 3 observations that are missing
sub_dt <- sub_dt[!is.na(spot_price_1week_lag2)]
# Residualize variables of interest on fixed effects and HDDs
reg_resid <- felm(
  log(thm_day) |
    log(price_mrg_lag2) |
    spot_price_1week_lag2 |
    I(spot_price_1week_lag2*(utility=="socal")) ~
    I(bill_hdd/days) |
    hh_id + city_ym,
  data = sub_dt, psdef = F, exactDOF = T)
# Create data table of residuals
resid_dt <- reg_resid$residuals %>% data.table()
# Change names (felm doesn't like parentheses)
setnames(resid_dt, c("log_thm_day", "log_price_mrg_lag2",
                     "spot_price_1week_lag2", "spot_price_1week_lag2_socal"))
# Add a few more variables
resid_dt[, `:=`(
  month = sub_dt[, month],
  price_cluster = sub_dt[, price_cluster],
  hh_id = sub_dt[, hh_id]
)]
# Iterature over sets of months, estimating model for each set:
# The 2nd lag of marginal price with city-ym and some other FEs,
# subsetted by the start date's calendar month
# First: Build monthly sets
step_list <- c(
  1:12 %>% str_pad(2, "left", "0") %>% as.list(),
  list(c("01", "02")),
  list(c("03", "04")),
  list(c("05", "06")),
  list(c("07", "08")),
  list(c("09", "10")),
  list(c("11", "12")),
  list(c("01", "02", "03")),
  list(c("04", "05", "06")),
  list(c("07", "08", "09")),
  list(c("10", "11", "12")),
  list(c(1:3, 10:12) %>% str_pad(2, "left", "0")),
  list(c(4:9) %>% str_pad(2, "left", "0")),
  NULL)
# Run regressions
for (x in step_list) {
  felm(log_thm_day ~
         -1 |
         0 |
         (log_price_mrg_lag2 ~
            spot_price_1week_lag2 +
            spot_price_1week_lag2_socal) |
         hh_id + price_cluster,
       data = resid_dt[month %in% x],
       psdef = F, exactDOF = T) %>%
    full_summary(paste0("MonthlyCommonFE/LogPriceMrg__Lag2__CityYm_HhId__",
                        "All__All__BillHddDays__Month_", paste(x, collapse = "_")))
  # Print
  print(paste("Done", x))
}

# Simulated IV Regressions: city-ym FEs ----------------------------------------
# No lags
felm(price_mrg ~ mrg_sim_iv_10_14 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__10_14__Lag0")
felm(price_mrg ~ mrg_sim_iv_11_13 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__11_13__Lag0")
# 1 lag
felm(price_mrg ~ mrg_sim_iv_10_14_lag1 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__10_14__Lag1")
felm(price_mrg ~ mrg_sim_iv_11_13_lag1 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__11_13__Lag1")
# 2 lag
felm(price_mrg ~ mrg_sim_iv_10_14_lag2 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__10_14__Lag2")
felm(price_mrg ~ mrg_sim_iv_11_13_lag2 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__11_13__Lag2")
# 1 lag
felm(price_mrg ~ mrg_sim_iv_10_14_lag3 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__10_14__Lag3")
felm(price_mrg ~ mrg_sim_iv_11_13_lag3 + I(bill_hdd/days) |
       hh_id + city_month |
       0 |
       hh_id + price_cluster,
     data = sub_dt,
     psdef = F, exactDOF = T) %>% full_summary("SIM__11_13__Lag3")

# Correlation of prices and bills ----------------------------------------------
# Correlation of the price measures
# NOTE: Simulated marginal price is missing some values, since it requires
# the 10th through 14th lags to exist.
price_cor <- fastCor(
  xt = sub_dt[!is.na(mrg_sim_iv_10_14),
              .(price_mrg, price_avg, price_avg_mrg, price_base, mrg_sim_iv_10_14)] %>%
    as.matrix(),
  nSplit = 1, upperTri = F)
# Correlation of lags for marginal price
lag_cor_mrg <- fastCor(
  xt = sub_dt[, .(price_mrg, price_mrg_lag1, price_mrg_lag2, price_mrg_lag3,
                  price_mrg_lag4, price_mrg_lag5)] %>% as.matrix() %>% na.omit(),
  nSplit = 1, upperTri = F)
# Correlation of lags for baseline price
lag_cor_base <- fastCor(
  xt = sub_dt[, .(price_base, price_base_lag1, price_base_lag2, price_base_lag3,
                  price_base_lag4, price_base_lag5)] %>% as.matrix() %>% na.omit(),
  nSplit = 1, upperTri = F)
# Correlation of lags for total bill
lag_cor_bill <- fastCor(
  xt = sub_dt[, .(total_bill, total_bill_lag1, total_bill_lag2, total_bill_lag3,
                  total_bill_lag4, total_bill_lag5)] %>% as.matrix() %>% na.omit(),
  nSplit = 1, upperTri = F)
# Correlation of lags for therms per day
lag_cor_thm_day <- fastCor(
  xt = sub_dt[, .(thm_day, thm_day_lag1, thm_day_lag2, thm_day_lag3,
                  thm_day_lag4, thm_day_lag5)] %>% as.matrix() %>% na.omit(),
  nSplit = 1, upperTri = F)
# Save
saveRDS(
  object = price_cor,
  file = paste0(dir_results, "cor_price.rds"))
saveRDS(
  object = lag_cor_mrg,
  file = paste0(dir_results, "cor_lag_mrg.rds"))
saveRDS(
  object = lag_cor_base,
  file = paste0(dir_results, "cor_lag_base.rds"))
saveRDS(
  object = lag_cor_bill,
  file = paste0(dir_results, "cor_lag_bill.rds"))
saveRDS(
  object = lag_cor_thm_day,
  file = paste0(dir_results, "cor_lag_thm_day.rds"))

# Summary statistics -----------------------------------------------------------
sub_dt[, uniqueN(zip), by = utility]
sub_dt[, uniqueN(zip9), by = utility]
sub_dt[, uniqueN(hh_id), by = utility]
sub_dt[, N, by = utility]
sub_dt[, sub(total_bill), by = utility]

# Balance by utility, care, and season -----------------------------------------
# Quick and simple formatting functions
num <- . %>% round(2) %>% format(nsmall = 2) %>% str_replace_all("-", "$-$")
sdf <- . %>% format(trim = F, digits = 3) %>% paste0("[", ., "]")
n <- . %>% format(trim = F, digits = 2, nsmall = 1,
                  scientific = F, big.mark = ",")
# Function that creates a vector, alternating between the two vectors
alt <- function(x,y) {
  lapply(X = 1:length(x), FUN = function(i) {
    c(x[i], y[i])
  }) %>% unlist()
}
# Add variable for daily allowance
sub_dt[, allowance_days := allowance / days]
# Take averages, standard deviations, and counts
avg_ucs <- sub_dt[,
                  .(thm, days, allowance, total_bill, bill_hdd, utility, care, season)][,
                                                                                        lapply(.SD, FUN = mean),
                                                                                        by = .(utility, care, season)]
sd_ucs <- sub_dt[,
                 .(thm, days, allowance, total_bill, bill_hdd, care, utility, care, season)][,
                                                                                             lapply(.SD, FUN = sd),
                                                                                             by = .(utility, care, season)]
n_ucs <- sub_dt[, .N, by = .(utility, care, season)]
sd_cs <- sub_dt[,
                .(thm, days, allowance, total_bill, bill_hdd, care, care, season)][,
                                                                                   lapply(.SD, FUN = sd),
                                                                                   by = .(care, season)]
# Set order
setorder(avg_ucs, season, care, utility)
setorder(sd_ucs, season, care, utility)
setorder(n_ucs, season, care, utility)
setorder(sd_cs, season, care)
# Iterate over cross-utility groups of summer and CARE
bal_mat <- lapply(X = 1:4, FUN = function(i) {
  # The group's row indices from the summary tables
  a <- i * 2 - 1
  b <- i * 2
  # Calculate the difference
  the_diff <- (avg_ucs[a, 4:8] - avg_ucs[b, 4:8]) %>% as.numeric()
  # Create the entry
  cbind(
    alt(avg_ucs[a,4:8] %>% num(), sd_ucs[a,4:8] %>% sdf()),
    alt(avg_ucs[b,4:8] %>% num(), sd_ucs[b,4:8] %>% sdf()),
    alt(the_diff %>% num(), sd_cs[i,3:7] %>% sdf()),
    NULL)
}) %>% do.call(cbind, .)
# Add number of observations
bal_mat %<>% rbind(.,
                   lapply(X = 1:4, FUN = function(i) {
                     c(n_ucs[(2*i - 1):(2*i),N] %>% n(), "")
                   }) %>% unlist())
#  Save
saveRDS(
  object = bal_mat,
  file = paste0(dir_results, "balanceMatrix.rds"))
